# Creates a list containing testing and training dataframes
createTraining <- function(data, seed = 123, trainPercent = 0.8) {
set.seed(seed)
n <- nrow(data)
numTrain <- floor(trainPercent * n)
trainingRows <- sample(1:n, size = numTrain, replace = FALSE)
trainingData <- data[trainingRows, ]
testingData <- data[-trainingRows, ]
return(list(training = trainingData, testing = testingData))
}
# Creates confidence and prediction intervals
jjIntervals <- function(data, model) {
confidence <- as.data.frame(predict.lm(model, newdata = data, interval = "confidence")) %>%
rename(confLwr = lwr, confUpr = upr)
prediction <- as.data.frame(predict.lm(model, newdata = data, interval = "prediction")) %>%
rename(predictLwr = lwr, predictUpr = upr) %>%
select(predictLwr, predictUpr)
intervalData <- cbind(data,confidence,prediction)
return(intervalData)
}
# Creates a density plot given parameters
jjplotDensity <- function(data,x,fill,color) {
plot <- ggplot(data, aes(x={{x}})) +
geom_density(aes(fill={{fill}}), alpha=0.4)+
geom_rug(aes(color={{color}}), y=0) +
theme_custom() +
theme(legend.position = "none")
return(plot)
}
# Creates a boxplot given parameters
jjplotBoxplot <- function(data,x,y,fill) {
plot <- ggplot(data=data, aes(x = {{x}}, y = {{y}}, fill = {{fill}})) +
geom_boxplot() +
coord_flip() +
theme_custom() +
theme(legend.position = "none")
return(plot)
}
# Creates a scatter plot
jjplotPoint <- function(data,x,y,color, model) {
data <- jjIntervals(data,model)
plot <- ggplot(data=data, aes(x = {{x}}, y = {{y}}, color = {{color}})) +
geom_point() +
geom_ribbon(aes(ymin = 10^confLwr, ymax = 10^confUpr), fill = "yellow", alpha = 0.4) +
geom_line(aes(y = 10^fit), color = "#3366FF", size = 0.75) +
geom_line(aes(y = 10^confLwr), linetype = "dashed", size = 0.75) +
geom_line(aes(y = 10^confUpr), linetype = "dashed", size = 0.75) +
geom_line(aes(y = 10^predictLwr), linetype = "dashed", color = "red", size = 0.75) +
geom_line(aes(y = 10^predictUpr), linetype = "dashed", color = "red", size = 0.75) +
theme_custom()
return(plot)
}
# Checks a model and gives back error and p-value
checkModel <- function(data, matrix) {
n <- nrow(data)
error <- (matrix[1,2] + matrix[2,1])/n
pHat <- (matrix[2,1]+matrix[2,2])/n
standardError <- sqrt(pHat*(1-pHat)/n)
pValue <- pnorm(error,pHat,standardError)
return(list("error" = error,"pValue" = pValue))
}
# Gets a prediction from the model
getPredict <- function(data, model) {
recidPredict <- predict.glm(model, newdata=data, type="response")
dataWithPredictions <- cbind(data,recidPredict)
return(dataWithPredictions)
}
# full data set
recid <- read.csv("datasets/Project3Sample4000.csv")
# mystery project data
recidMysteryBox <- read.csv("datasets/Project3Mystery100.csv")
Recidivism is a term used within the criminal justice system which means “the tendency of a criminal to reoffend after serving a sentence in a disciplinary institution.” The data we will be analyzing is from Broward County, Florida and includes recidivism predictions from two proprietary tests given to inmates. The goal of this analysis is to, based on various factors of an inmate, predict whether or not they will reoffend within two years of being released.
By the end of this analysis, we hope to have an accurate classification model for whether or not a person is likely to reoffend as well as have the ability to discuss the accuracy of the model in detail.
Finally, we hope to understand the ethical implications of the model we make and to know how to mitigate and/or measure the biases held by the model itself.
This task revolves around visualizing the data and making the data we are given usable. We clean the full data set found in the Project3Sample4000.csv file. This includes data cleaning, feature engineering, and data refining along with the creation of a testing training split.
## Data Cleaning
recid2 <- recid %>%
rename(
dayBefScreenArrest = days_b_screening_arrest,
jailIn = c_jail_in,
jailOut = c_jail_out,
daysFromCompas = c_days_from_compas,
chargeDegree = c_charge_degree,
chargeDesc = c_charge_desc,
riskRecidDecileScore = RiskRecidDecileScore,
riskRecidScoreLevel = RiskRecidScoreLevel,
riskRecidScreeningDate = RiskRecidScreeningDate,
riskViolenceDecileScore = RiskViolenceDecileScore,
riskViolenceScoreLevel = RiskViolenceScoreLevel
) %>%
mutate(
dob = as_date(dmy(dob)),
ageCat = as.factor(ageCat),
race = as.factor(race),
jailIn = as.Date(dmy_hm(jailIn, tz = "EST")),
jailOut = as.Date(dmy_hm(jailOut, tz = "EST")),
chargeDegree = as.factor(gsub("[()]","",chargeDegree)),
riskRecidScoreLevel = as.factor(riskRecidScoreLevel),
riskRecidScreeningDate = as_date(dmy(riskRecidScreeningDate))
)
## Data Engineering
recid3 <- recid2 %>%
mutate(
daysInJail = as.numeric(difftime(jailOut,jailIn,unit="days")+1),
logDaysInJail = log10(daysInJail),
logPriorsCount = log10(priorsCount+0.1),
juvCount = juvFelonyCount + juvMisdemeanerCount + juvOtherCount,
logJuvCount = log10(juvCount+0.1)
)
## Data Removal
recid4 <- recid3 %>%
select (
-name,
-dob,
-race
)
## Testing Training Split
testingTraining <- createTraining(recid4, seed=8675309)
recidTraining <- testingTraining$training
recidTesting <- testingTraining$testing
In our data cleaning, we forced date variables to be dates and factor variables to be factors. Then we engineered a few categories in the data - some for convenience and some for purpose. These include the following: daysInJail (difference between entry and exit of jail), logDaysInJail (log base 10 of daysInJail), logPriorsCount (log base 10 of priorsCount), juvCount (total of all juvenile crime categories), and logJuvCount (log base 10 of juvCount). We then remove name, dob, and race because name is irrelevant to recidivism, dob is covered by the included age category, and race is not fair to include in a predictive model as there is no definitive difference aside from visually between two people of different races.
In this section we will be creating three different models based on the data set given and now refined. Our first model will be a logistic model to predict whether or not an inmate will reoffend within two years of their release. Our second model will predict the risk of recidivism score. This score was given by a written test given to prisoners and was computed by a third-party company that uses a black-box algorithm to compute the value of riskRecidScoreLevel. Our third model will predict the violence score of an inmate. This was once more calculated from a test administered to inmates and was computed by a third-party company using a black-box algorithm.
This is our first model. It is a logistic regression model which predicts whether or not an inmate will reoffend within two years of being released. The following plots are some data visualization relevant to this model.
### DaysInJail Plot
p1 <- recidTraining %>%
jjplotDensity(x = daysInJail, fill = as.factor(isRecid), color = as.factor(isRecid)) +
labs(
title="Days in Jail",
x = "Days in Jail"
)
p2 <- recidTraining %>%
jjplotDensity(x = logDaysInJail, fill = as.factor(isRecid), color = as.factor(isRecid)) +
labs(
title="Log10 of Days in Jail",
x = "log10(daysInJail)"
)
p3 <- recidTraining %>%
jjplotBoxplot(x = isRecid, y=daysInJail, fill=as.factor(isRecid)) +
labs(
title="Days in Jail",
y = "Days in Jail",
x = "Reoffence Prediction Proportion"
)
p4 <- recidTraining %>%
jjplotBoxplot(x = isRecid, y=logDaysInJail, fill=fct_recode(as.factor(isRecid),Yes = "1", No = "0")) +
labs(
title="Log Base 10 of Days in Jail",
y = "log10(daysInJail)",
x = "Reoffence Prediction Proportion",
fill = "Reoffence Prediction Proportion"
)
p1 + p2 + p3 + p4 +
plot_annotation(
title = "Days in Jail and log10(Days in Jail)",
theme=theme_custom()
) + plot_layout(guides = 'collect')
Figure 1 illustrates that a higher proportion of inmates who spent less time in prison when compared to the proportion of prisoners likely to reoffend who spent a longer duration in jail. This is shown more clearly in the right two plots as the left two plots are so heavily skewed left that they are not very readable. The left two plots are included to demonstrate that taking the log base 10 of daysInJail eliminates much of the leftward skew therefore being a more sensitive predictor to be included within a model than simply daysInJail.
### Priors Count
p5 <- recidTraining %>%
jjplotDensity(x = priorsCount, fill = as.factor(isRecid), color = as.factor(isRecid)) +
labs(
title="Priors Counts",
x = "Priors Counts"
)
p6 <- recidTraining %>%
jjplotDensity(x = logPriorsCount, fill = as.factor(isRecid), color = as.factor(isRecid)) +
labs(
title="log10(Priors Counts + 0.1)",
x = "log10(Priors Counts + 0.1)"
)
p7 <- recidTraining %>%
jjplotBoxplot(x = isRecid, y=priorsCount, fill=as.factor(isRecid)) +
labs(
title="Priors Counts",
y = "Priors Counts",
x = "Recidivated",
fill = "Recidivated"
)
p8 <- recidTraining %>%
jjplotBoxplot(x = isRecid, y=logPriorsCount, fill=fct_recode(as.factor(isRecid),Yes = "1", No = "0")) +
theme(legend.position = "right") +
labs(
title="log10(Priors Counts + 0.1)",
y = "log10(Priors Counts + 0.1)",
x = "Recidivated",
fill = "Recidivated"
)
p5 + p6 + p7 + p8 + plot_annotation(title = "Priors Counts", theme=theme_custom()) + plot_layout(guides = 'collect')
Figure 2 illustrates that taking the log base 10 of the number of prior offenses (plus 0.1 to avoid taking the log10 of 0) improves the sensitivity of the predictions made with that variable as well as reducing the number of outliers included in the data which means the model will better predict whether or not an inmate will reoffend after being released. For this reason, we will be using the log base 10 of prior count as opposed to just priorCount.
### Juvenile Priors Count
p9 <- recidTraining %>%
jjplotDensity(x = juvCount, fill = as.factor(isRecid), color = as.factor(isRecid)) +
labs(
title="Juvenile Priors Counts",
x = "Juvenile Priors Counts"
)
p10 <- recidTraining %>%
jjplotDensity(x = logJuvCount, fill = as.factor(isRecid), color = as.factor(isRecid)) +
labs(
title="log10(Juvenile Priors Counts + 0.1)",
x = "log10(Juvenile Priors Counts + 0.1)"
)
p11 <- recidTraining %>%
jjplotBoxplot(x = isRecid, y=juvCount, fill=as.factor(isRecid)) +
labs(
title="Juvenile Priors Counts",
y = "Juvenile Priors Counts",
x = "Recidivated",
fill = "Recidivated"
)
p12 <- recidTraining %>%
jjplotBoxplot(x = isRecid, y=logJuvCount, fill=fct_recode(as.factor(isRecid),Yes = "1", No = "0")) +
theme(legend.position = "right") +
labs(
title="log10(Juvenile Priors Counts + 0.1)",
y = "log10(Juvenile Priors Counts + 0.1)",
x = "Recidivated",
fill = "Recidivated"
)
p9 + p10 + p11 + p12 + plot_annotation(title = "Juvenile Priors Counts", theme=theme_custom()) + plot_layout(guides = 'collect')
Figure 3 shows that we can rule out prior crimes committed in juvenile years. There was not enough relevant data in these plots to include the variable or the log base 10 of the variable; therefore, this plot is important to include as a justification of refining our model later on. This is one of the few instances where we could simply rule out a variable this early in the process.
### Age
p13 <- recidTraining %>%
jjplotDensity(x = age, fill = as.factor(isRecid), color = as.factor(isRecid)) +
labs(
title="Age",
x = "Juvenile Priors Counts"
)
p14 <- recidTraining %>%
jjplotBoxplot(x = isRecid, y=age, fill=fct_recode(as.factor(isRecid),Yes = "1", No = "0")) +
theme(legend.position = "right") +
labs(
title="Age",
x = "Age",
y = "Recidivated",
fill = "Recidivated"
)
p13 / p14 + plot_annotation(theme=theme_custom())
Figure 4 shows that age is a factor in whether or not a person will reoffend. Shown by the density and boxplots, we can determine that in this data, there is a higher proportion of reoffenders in younger populations and a higher proportion of non-reoffenders in older populations. The age cutoff in the trend is 35 years old. This informs us that we could potentially use age as a predictor in the model to predict recidivism.
### Sex
ggplot(data=recidTraining,aes(x=sex, fill=fct_recode(as.factor(isRecid),Yes = "1", No = "0"))) +
geom_bar(position = "dodge") +
labs(
title="Sex",
x = "Sex",
fill = "Recidivated"
) +
theme_custom()
Figure 5 takes into account sex as a factor determining reoffending rates. This plot, regarding reoffending rates between male and female populations, shows us that men have a higher probability in relation to male populations to reoffend than women have in relation to the population of women. This means that this variable would most likely be useful to the model; however, sex along with race will not be included for ethical reasons as it is not ethical or fair to judge someone more harshly based on their sex or race.
### ChargeDegree
ggplot(data=recidTraining,aes(x=chargeDegree, fill=fct_recode(as.factor(isRecid),Yes = "1", No = "0"))) +
geom_bar(position = "dodge") +
labs(
title="Charge Degree",
x = "Charge Degree",
fill = "Recidivated"
) +
theme_custom()
Figure 6 illustrates the difference in recidivism rate based on charge degree. Despite these being ordered properly from most severe crime to least severe crime, there does not seem to be a consistent pattern by eye. This means we can leave this variable in the initial model but it may be refined out later.
### Colinearity Check
p15 <- ggplot(recidTraining, aes(x = logDaysInJail, y = logPriorsCount, color = fct_recode(as.factor(isRecid),Yes = "1", No = "0"))) +
geom_point() +
labs(
title="log10(Days In Jail) vs log10(Priors Count)",
x = "log10(Days In Jail)",
y = "log10(Priors Count)",
color = "Recidivated"
) +
theme_custom()
p16 <- ggplot(recidTraining, aes(x = logDaysInJail, y = age, color = fct_recode(as.factor(isRecid),Yes = "1", No = "0"))) +
geom_point() +
labs(
title="log10(Days In Jail) vs Age",
x = "log10(Days In Jail)",
y = "Age",
color = "Recidivated"
) +
theme_custom()
p17 <- ggplot(recidTraining, aes(x = logPriorsCount, y = age, color = fct_recode(as.factor(isRecid),Yes = "1", No = "0"))) +
geom_point() +
labs(
title="log10(Priors Count) vs Age",
x = "log10(Priors Count)",
y = "Age",
color = "Recidivated"
) +
theme_custom()
p15 / (p16 + p17) + plot_annotation(title = "Colinearity Check", theme=theme_custom()) + plot_layout(guides = 'collect')
Figure 7 shows us that none of the three variables (age, logDaysInJail, and logPriorsCount) are colinear. The three subplots in conjunction show us that each of these three variables is not colinear with any of the others, and, therefore, all of them may be included in the model to potentially be refined out later.
## Everything Model
everythingModel <- glm(isRecid ~ age + priorsCount + daysInJail + logPriorsCount + logDaysInJail,data=recidTraining, family = binomial)
everythingPredictTrain <- getPredict(recidTraining, everythingModel) %>%
mutate(prediction = ifelse(recidPredict < 0.5, "Did Not Reoffend", "Reoffended"))
everythingMatrixTrain <- table(everythingPredictTrain$isRecid,everythingPredictTrain$prediction)
everythingPredictTest <- getPredict(recidTesting, everythingModel) %>%
mutate(prediction = ifelse(recidPredict < 0.5, "Did Not Reoffend", "Reoffended"))
everythingMatrixTest <- table(everythingPredictTest$isRecid,everythingPredictTest$prediction)
#checkModel(recidTraining,everythingMatrixTrain)
#checkModel(recidTesting,everythingMatrixTest)
## Basic Model
baseModel <- glm(isRecid ~ sex + age + priorsCount + daysInJail,data=recidTraining, family = binomial)
basePredictTrain <- getPredict(recidTraining, baseModel) %>%
mutate(prediction = ifelse(recidPredict < 0.5, "Did Not Reoffend", "Reoffended"))
baseMatrixTrain <- table(basePredictTrain$isRecid,basePredictTrain$prediction)
basePredictTest <- getPredict(recidTesting, baseModel) %>%
mutate(prediction = ifelse(recidPredict < 0.5, "Did Not Reoffend", "Reoffended"))
baseMatrixTest <- table(basePredictTest$isRecid,basePredictTest$prediction)
#checkModel(recidTraining,baseMatrixTrain)
#checkModel(recidTesting,baseMatrixTest)
## Log Model
logModel <- glm(isRecid ~ sex + age + logPriorsCount + logDaysInJail, data=recidTraining, family = binomial)
logPredictTrain <- getPredict(recidTraining, logModel) %>%
mutate(prediction = ifelse(recidPredict < 0.5, "Did Not Reoffend", "Reoffended"))
logMatrixTrain <- table(logPredictTrain$isRecid,logPredictTrain$prediction)
logPredictTest <- getPredict(recidTesting, logModel) %>%
mutate(prediction = ifelse(recidPredict < 0.5, "Did Not Reoffend", "Reoffended"))
logMatrixTest <- table(logPredictTest$isRecid,logPredictTest$prediction)
#checkModel(recidTraining,logMatrixTrain)
#checkModel(recidTesting,logMatrixTest)
## Recid Model
recidModel <- glm(isRecid ~ sex + age + logPriorsCount + daysInJail, data=recidTraining, family = binomial)
recidPredictTrain <- getPredict(recidTraining, recidModel) %>%
mutate(prediction = ifelse(recidPredict < 0.5, "Did Not Reoffend", "Reoffended"))
recidMatrixTrain <- table(recidPredictTrain$isRecid,recidPredictTrain$prediction)
recidPredictTest <- getPredict(recidTesting, recidModel) %>%
mutate(prediction = ifelse(recidPredict < 0.5, "Did Not Reoffend", "Reoffended"))
recidMatrixTest <- table(recidPredictTest$isRecid,recidPredictTest$prediction)
#checkModel(recidTraining,recidMatrixTrain)
#checkModel(recidTesting,recidMatrixTest)
## Recid 2 Model
recid2Model <- glm(isRecid ~ age + logPriorsCount + daysInJail, data=recidTraining, family = binomial)
recid2PredictTrain <- getPredict(recidTraining, recid2Model) %>%
mutate(prediction = ifelse(recidPredict < 0.5, "Did Not Reoffend", "Reoffended"))
recid2MatrixTrain <- table(recid2PredictTrain$isRecid,recid2PredictTrain$prediction)
recid2PredictTest <- getPredict(recidTesting, recid2Model) %>%
mutate(prediction = ifelse(recidPredict < 0.5, "Did Not Reoffend", "Reoffended"))
recid2MatrixTest <- table(recid2PredictTest$isRecid,recid2PredictTest$prediction)
#checkModel(recidTraining,recid2MatrixTrain)
#checkModel(recidTesting,recid2MatrixTest)
## Recid 3 Model
recid3Model <- glm(isRecid ~ age + logPriorsCount, data=recidTraining, family = binomial)
recid3PredictTrain <- getPredict(recidTraining, recid3Model) %>%
mutate(prediction = ifelse(recidPredict < 0.5, "Did Not Reoffend", "Reoffended"))
recid3MatrixTrain <- table(recid3PredictTrain$isRecid,recid3PredictTrain$prediction)
recid3PredictTest <- getPredict(recidTesting, recid3Model) %>%
mutate(prediction = ifelse(recidPredict < 0.5, "Did Not Reoffend", "Reoffended"))
recid3MatrixTest <- table(recid3PredictTest$isRecid,recid3PredictTest$prediction)
#checkModel(recidTraining,recid3MatrixTrain)
#checkModel(recidTesting,recid3MatrixTest)
## Recid 4 Model
recid4Model <- glm(isRecid ~ age + sex + logPriorsCount, data=recidTraining, family = binomial)
recid4PredictTrain <- getPredict(recidTraining, recid4Model) %>%
mutate(prediction = ifelse(recidPredict < 0.5, "Did Not Reoffend", "Reoffended"))
recid4MatrixTrain <- table(recid4PredictTrain$isRecid,recid4PredictTrain$prediction)
recid4PredictTest <- getPredict(recidTesting, recid4Model) %>%
mutate(prediction = ifelse(recidPredict < 0.5, "Did Not Reoffend", "Reoffended"))
recid4MatrixTest <- table(recid4PredictTest$isRecid,recid4PredictTest$prediction)
#checkModel(recidTraining,recid4MatrixTrain)
#checkModel(recidTesting,recid4MatrixTest)
recidMysteryBox <- read.csv("datasets/Project3Mystery100.csv")
## Recid 5 Model
recid5Model <- glm(isRecid ~ age + priorsCount, data=recidTraining, family = binomial)
recid5PredictTrain <- getPredict(recidTraining, recid5Model) %>%
mutate(prediction = ifelse(recidPredict < 0.5, "Did Not Reoffend", "Reoffended"))
recid5MatrixTrain <- table(recid5PredictTrain$isRecid,recid5PredictTrain$prediction)
recid5PredictTest <- getPredict(recidTesting, recid5Model) %>%
mutate(prediction = ifelse(recidPredict < 0.5, "Did Not Reoffend", "Reoffended"))
recid5MatrixTest <- table(recid5PredictTest$isRecid,recid5PredictTest$prediction)
#checkModel(recidTraining,recid5MatrixTrain)
#checkModel(recidTesting,recid5MatrixTest)
## Recid 6 Model
recid6Model <- glm(isRecid ~ age + logPriorsCount + logDaysInJail, data=recidTraining, family = binomial)
recid6PredictTrain <- getPredict(recidTraining, recid6Model) %>%
mutate(prediction = ifelse(recidPredict < 0.5, "Did Not Reoffend", "Reoffended"))
recid6MatrixTrain <- table(recid6PredictTrain$isRecid,recid6PredictTrain$prediction)
recid6PredictTest <- getPredict(recidTesting, recid6Model) %>%
mutate(prediction = ifelse(recidPredict < 0.5, "Did Not Reoffend", "Reoffended"))
recid6MatrixTest <- table(recid6PredictTest$isRecid,recid6PredictTest$prediction)
#stored for use in markdown
trainList6 <- checkModel(recidTraining,recid6MatrixTrain)
trainError <- trainList6$error
trainPVal <- trainList6$pValue
testList6 <- checkModel(recidTesting,recid6MatrixTest)
testError <- testList6$error
testPVal <- testList6$pValue
matrix = recid6MatrixTest
manualErrorValue <- (matrix[1,2]+matrix[2,1])/nrow(recidTesting)
coeffs = recid6Model$coefficients
nullHyp = (matrix[2,1]+matrix[2,2])/nrow(recidTesting)
## Final Model
finalModel <- recid6Model
The model we made from this data to predict whether or not an inmate will reoffend without the use of the proprietary recidivation and violence scores included in the data is as follows:
\[willReoffend = \lfloor 0.596374 + -0.0367996 \cdot age + \]
\[0.8183817 \cdot log_{10}(priorsCount) + 0.0302903 \cdot log_{10}(daysInJail) \rceil\]
The model equation above has a mixed pairing of one floor and one ceiling symbol, this is intended to communicate proper rounding. These model statistics will be based off the testing data set as we only care how well it can predict unknown value, not how well the model we have fits the data it was made from.
Some model coefficients listed above are a bit obscure in their effect. For example, the coefficients of the logged values indicated that the prediction (ideally between zero and one but rounded) will go up or down according to every power of ten included in the original variable. This is in contrast to the coefficient for the linear term, age, which indicates that the prediciton will go down by 0.03 likelihood to reoffend for every year of age the inmate has.
This model, when demonstrated on testing data, has a prediction error of \(0.27875\) meaning that approximately \(27.88\%\) of the predictions made with this model will either be false positives or false negatives. In this case that means \(27.88\%\) of inmates will be predicted to reoffend when they do not OR predicted not to reoffend when they do.
This is derived from our confusion matrix which was created upon the testing of this model. Our error rate for those inmates who did not reoffend can be shown as a proportion of the data which was predicted incorrectly as reoffending whereas the error rate for those inmates that did reoffend can be shown as the proportion of the data which was predicted incorrectly as not reoffending. \(errorPercentage = \frac{150 + 73}{800}\). Here, the first term in the numerator is the number of people in the data frame that did not reoffend but were predicted to reoffend while the second term in the numerator is the number of people who did reoffend but were predicted as not reoffending. The term in the denominator is the sample size contained within this specific dataframe. This ultimately results in the error value 0.27875.
This model’s p-value regarding the testing data is \(0.001304\). When this value is below 0.05, that means the model fitted to the data is statistically significant as it has a less than \(5\%\) chance to occur randomly given random data whereas if it is above 0.05 it is not statistically significant because it has a greater than \(5\%\) chance to occur randomly given random data. This p-value was created by a function located at the top of the markdown; however, it was calculated with a one-tail hypothesis test comparing two proportions. The null model states that \(33\%\) of inmates will recidivate as this is the percentage of inmates in the data that do recidivate. The alternate hypothesis is that the recidivation rate is something other than this. The two proportions used for this hypothesis test are the error, which is the number of people who were incorrectly predicted over the number of total people, and the p-hat value, which is the total number of people who reoffended in the selected data set over the total number of people in that data set. The standard error used was from a standard calculation of standard error given population size and a single proportion.